# -*- coding: utf-8 -*-
import re
from datetime import datetime

import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from SinaWeiboSpider.items import RelationshipsItem


class SinaAccountFansFollowSpider(scrapy.Spider):
    """Crawl the follow list and fan list of Sina Weibo accounts.

    For each seed uid it requests both listings on weibo.cn and yields one
    ``RelationshipsItem`` per observed edge, where ``fan_id`` follows
    ``followed_id`` and ``_id`` is ``"<fan_id>-<followed_id>"``.
    """

    name = 'sina_account_fans_follow'
    allowed_domains = ['weibo.cn']
    base_url = 'http://weibo.cn/'

    # Patterns are loop-invariant: compile once at class level.
    # NOTE: raw strings — the original non-raw '\d' escapes trigger
    # warnings on modern Python.
    _page_count_re = re.compile(r'/>&nbsp;1/(\d+)页</div>')
    _uid_re = re.compile(r'uid=(\d+)', re.S)
    _follow_owner_re = re.compile(r'(\d+)/follow')
    _fans_owner_re = re.compile(r'(\d+)/fans')

    def start_requests(self):
        start_uids = ['5226257511', ]
        for uid in start_uids:
            # Fetch the follow list (accounts this uid follows).
            yield Request(url=f'{self.base_url}{uid}/follow?page=1', callback=self.parse_follow, dont_filter=True)
            # Fetch the fan list (accounts following this uid).
            yield Request(url=f'{self.base_url}{uid}/fans?page=1', callback=self.parse_fans, dont_filter=True)

    def _paginate(self, response, callback):
        """On page 1 of a listing, schedule requests for pages 2..N.

        Reads the total page count from the pager footer; yields nothing
        when the response is not page 1 or no pager is present.
        """
        if not response.url.endswith('page=1'):
            return
        match = self._page_count_re.search(response.text)
        if not match:
            return
        total_pages = int(match.group(1))
        for page_num in range(2, total_pages + 1):
            page_url = response.url.replace('page=1', 'page={}'.format(page_num))
            yield Request(page_url, callback, dont_filter=True, meta=response.meta)

    def parse_follow(self, response):
        """Parse one page of an account's follow list into relationship items."""
        # From page 1, fan out to all remaining pages in one pass.
        yield from self._paginate(response, self.parse_follow)
        # Each followed account row carries a follow/unfollow link with its uid.
        urls = response.xpath('//a[text()="关注他" or text()="关注她" or text()="取消关注"]/@href').extract()
        uids = self._uid_re.findall(";".join(urls))
        owner_id = self._follow_owner_re.findall(response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %X')
            # Listing owner follows each uid found on the page.
            relationships_item["fan_id"] = owner_id
            relationships_item["followed_id"] = uid
            relationships_item["_id"] = owner_id + '-' + uid
            yield relationships_item

    def parse_fans(self, response):
        """Parse one page of an account's fan list into relationship items."""
        # From page 1, fan out to all remaining pages in one pass.
        yield from self._paginate(response, self.parse_fans)
        # Each fan row carries a follow/remove link with its uid.
        urls = response.xpath('//a[text()="关注他" or text()="关注她" or text()="移除"]/@href').extract()
        uids = self._uid_re.findall(";".join(urls))
        owner_id = self._fans_owner_re.findall(response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %X')
            # Each uid found on the page follows the listing owner.
            relationships_item["fan_id"] = uid
            relationships_item["followed_id"] = owner_id
            relationships_item["_id"] = uid + '-' + owner_id
            yield relationships_item


if __name__ == '__main__':
    # Allow running this spider standalone with the project's settings.
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(SinaAccountFansFollowSpider.name)
    crawler.start()